import re
from collections import Counter
from re import sub

import joblib
from jieba import cut
from numpy import array

def getWordsFromFile(txtFile):
    """Extract all usable words from one mail text file.

    txtFile: path to a UTF-8 encoded text file holding the mail body.
    Returns a list of words produced by jieba segmentation, with noise
    characters removed and single-character tokens dropped.
    """
    # Hoist the compiled pattern out of the per-line loop: the noise
    # character class (punctuation, digits, CJK punctuation) never changes.
    noise = re.compile(r'[.【】0-9、—。，！~\*]')
    words = []
    # All mail text files are stored as UTF-8.
    with open(txtFile, encoding='utf8') as fp:
        for line in fp:
            # Strip surrounding whitespace, then remove noise characters.
            cleaned = noise.sub('', line.strip())
            # Segment with jieba and keep only words longer than one char.
            words.extend(w for w in cut(cleaned) if len(w) > 1)
    return words


# Restore the trained classifier persisted by the training script.
model = joblib.load("垃圾邮件分类器.pkl")
print('加载模型和训练结果成功。')

# The feature vocabulary was saved as a single comma-separated line.
with open('topWords.txt', encoding='utf8') as fp:
    vocabularyText = fp.read()
topWords = vocabularyText.split(',')

def predict(txtFile):
    """Classify the mail stored in txtFile.

    txtFile: path to a UTF-8 mail text file.
    Returns '垃圾邮件' (spam) when the model predicts class 1,
    otherwise '正常邮件' (ham).
    """
    words = getWordsFromFile(txtFile)
    # Count every word in one pass (O(n)) instead of calling
    # list.count() once per vocabulary entry (O(n * m)).
    freq = Counter(words)
    # Feature vector: occurrence count of each vocabulary word, in order.
    currentVector = array([freq[w] for w in topWords])
    # The model expects a 2-D array of shape (1, n_features).
    result = model.predict(currentVector.reshape(1, -1))[0]
    return '垃圾邮件' if result == 1 else '正常邮件'

# Files 151.txt through 155.txt hold the test mail bodies.
for index in range(151, 156):
    mail = '%d.txt' % index
    print(mail, predict(mail), sep=':')
